#!/usr/bin/python
# -*- coding: utf-8 -*-

import MySQLdb

# Build, for every user with at least one favourited video, a term-frequency
# (tf) vector of the baike tags attached to those videos, then dump the
# vectors to a plain-text file.
#
# Output layout, one record per user:
#   user: <user_id>
#   tag: <baike_tag>
#   tf: <number of times the tag occurs across the user's favourites>
# with the tag/tf pair repeated once per distinct tag of that user.

OUTPUT_PATH = '/home/chaojiansong/videobook/notinsvn/user_tfvector'

# NOTE(review): host and credentials are hard-coded here; consider moving
# them to configuration or the environment.
conn = MySQLdb.Connection("172.16.7.58", "root", "researchsucks", "X", 3306)
try:
    conn.set_character_set("utf8")
    cur = conn.cursor()
    try:
        # Collect the id of every user that has favourited videos.
        cur.execute("select distinct user_id from user_fav")
        user_ids = [row[0] for row in cur.fetchall()]

        # Compute each user's baike_tag counts; tf[i] belongs to user_ids[i].
        tf = []
        for user_id in user_ids:
            # Values are passed as query parameters rather than concatenated
            # into the SQL text, so the driver does the quoting/escaping and
            # ids containing quotes (or non-string ids) cannot break the query.
            cur.execute(
                "select video_id from user_fav where user_id=%s",
                (user_id,))
            video_ids = [row[0] for row in cur.fetchall()]

            cur_tf = {}
            for video_id in video_ids:
                cur.execute(
                    "select baike_tag from pop_history_baike_tags"
                    " where video_id=%s",
                    (video_id,))
                for row in cur.fetchall():
                    tag = row[0]
                    cur_tf[tag] = cur_tf.get(tag, 0) + 1
            tf.append(cur_tf)
    finally:
        # Release the cursor even if one of the queries failed.
        cur.close()
finally:
    # Release the connection on both the success and the failure path.
    conn.close()

# Write the user tf vectors to the output file.  The file is opened only after
# every query has succeeded, so a failed run does not truncate an existing
# result; the with-statement closes it even if a write raises.
with open(OUTPUT_PATH, 'w') as out:
    for user_id, cur_tf in zip(user_ids, tf):
        out.write('user: ' + user_id + '\n')
        for tag, count in cur_tf.items():
            out.write('tag: ' + tag + '\n')
            out.write('tf: ' + str(count) + '\n')
